R libraries.

library(ggplot2)
library(ggrepel)
library(ggpubr)
library(stringr)
library(MASS)
library(RColorBrewer)
library(DESeq2)

library(viridis)
library(ggpointdensity)
library(dplyr)
library(data.table)
library(ComplexHeatmap)
library(UpSetR)
library(readxl)
theme_set(theme_classic())

#dat_path = "~/research/data/MorPhiC/December-2023/JAX_RNAseq_ExtraEmbryonic/"
dat_path = "../../MorPhiC/data/December-2023/JAX_RNAseq_ExtraEmbryonic/"
dat_path = paste0(dat_path, "processed/Tables")

Read in meta data

meta_flat = read_excel("data/JAX_RNAseq_ExtraEmbryonic_meta_falttened.xlsx")
meta_flat = as.data.frame(meta_flat)
dim(meta_flat)
## [1] 90 37
meta_flat[1:2,]
##                                                                                                  Dataset
## 1 JAX; CRISPR-Cas9 KO; Primitive Syncytium, Extra-embryonic mesoderm; Bulk RNA-seq; 10X CRISPR; Illumina
## 2 JAX; CRISPR-Cas9 KO; Primitive Syncytium, Extra-embryonic mesoderm; Bulk RNA-seq; 10X CRISPR; Illumina
##                                          Counts file
## 1 PPARG_KO_H02_GT23-11450_GCGCAAGC-CGGCGTGA_S40_L002
## 2 PPARG_KO_G02_GT23-11449_ATATGGAT-TAATACAG_S50_L002
##                                                 Sequence file read 1
## 1 PPARG_KO_H02_GT23-11450_GCGCAAGC-CGGCGTGA_S40_L002_R1_001.fastq.gz
## 2 PPARG_KO_G02_GT23-11449_ATATGGAT-TAATACAG_S50_L002_R1_001.fastq.gz
##                                                 Sequence file read 2
## 1 PPARG_KO_H02_GT23-11450_GCGCAAGC-CGGCGTGA_S40_L002_R2_001.fastq.gz
## 2 PPARG_KO_G02_GT23-11449_ATATGGAT-TAATACAG_S50_L002_R2_001.fastq.gz
##   INPUT LIBRARY PREPARATION ID                 RUN ID
## 1                   GT23-11450 20230918_23-robson-010
## 2                   GT23-11449 20230918_23-robson-010
##   LIBRARY AVERAGE FRAGMENT SIZE LIBRARY INPUT AMOUNT VALUE
## 1                           406                        300
## 2                           402                        300
##   LIBRARY INPUT AMOUNT UNIT LIBRARY FINAL YIELD VALUE LIBRARY FINAL YIELD UNIT
## 1                       ngs                     168.8                      ngs
## 2                       ngs                     199.6                      ngs
##   LIBRARY CONCENTRATION VALUE LIBRARY CONCENTRATION UNIT LIBRARY PCR CYCLES
## 1                        47.2                         nM                 10
## 2                        66.8                         nM                 10
##   LIBRARY PCR CYCLES FOR SAMPLE INDEX DIFFERENTIATED CELL LINE ID
## 1                                  NA           PrS-MOK20026W-H02
## 2                                  NA           PrS-MOK20026W-G02
##                                                       DIFFERENTIATED CELL LINE DESCRIPTION
## 1 KOLF2.2 deleted PPARG by deletion of full coding region (KO) derived primitive syncytium
## 2 KOLF2.2 deleted PPARG by deletion of full coding region (KO) derived primitive syncytium
##   TIMEPOINT TIME-POINT UNIT TERMINALLY DIFFERENTIATED?
## 1         6            days                        Yes
## 2         6            days                        Yes
##                                Model System INPUT CELL LINE ID CLONE ID
## 1 extra-embryonic primitive syncytial cells      MOK20026W-H02      H02
## 2 extra-embryonic primitive syncytial cells      MOK20026W-G02      G02
##    CELL LINE ID CELL LINE TYPE DERIVED FROM CELL LINE NAME
## 1 MOK20026W-H02           iPSC                    KOLF2.2J
## 2 MOK20026W-G02           iPSC                    KOLF2.2J
##                                          CELL LINE DESCRIPTION
## 1 KOLF2.2 deleted PPARG by deletion of full coding region (KO)
## 2 KOLF2.2 deleted PPARG by deletion of full coding region (KO)
##   ALTERED GENE SYMBOLS ALTERATION TYPE TARGETED GENOMIC REGION
## 1                PPARG        deletion full coding region (KO)
## 2                PPARG        deletion full coding region (KO)
##   TARGETED GENOMIC REGION COORDINATES                            GUIDE SEQUENCE
## 1              chr3:12379702-12417187 CAACCATGGTCATTTCTGAA;GATCTTCTATGAAAGAGGGT
## 2              chr3:12379702-12417187 CAACCATGGTCATTTCTGAA;GATCTTCTATGAAAGAGGGT
##   ZYGOSITY GENE EXPRESSION ALTERATION PROTOCOL ID
## 1      N.A                               JAXPE001
## 2      N.A                               JAXPE001
##   LIBRARY PREPARATION PROTOCOL ID SEQUENCING PROTOCOL ID
## 1                        JAXPL001               JAXPS001
## 2                        JAXPL001               JAXPS001
##   DIFFERENTIATION PROTOCOL ID
## 1                    JAXPD001
## 2                    JAXPD001
names(meta_flat)
##  [1] "Dataset"                               
##  [2] "Counts file"                           
##  [3] "Sequence file read 1"                  
##  [4] "Sequence file read 2"                  
##  [5] "INPUT LIBRARY PREPARATION ID"          
##  [6] "RUN ID"                                
##  [7] "LIBRARY AVERAGE FRAGMENT SIZE"         
##  [8] "LIBRARY INPUT AMOUNT VALUE"            
##  [9] "LIBRARY INPUT AMOUNT UNIT"             
## [10] "LIBRARY FINAL YIELD VALUE"             
## [11] "LIBRARY FINAL YIELD UNIT"              
## [12] "LIBRARY CONCENTRATION VALUE"           
## [13] "LIBRARY CONCENTRATION UNIT"            
## [14] "LIBRARY PCR CYCLES"                    
## [15] "LIBRARY PCR CYCLES FOR SAMPLE INDEX"   
## [16] "DIFFERENTIATED CELL LINE ID"           
## [17] "DIFFERENTIATED CELL LINE DESCRIPTION"  
## [18] "TIMEPOINT"                             
## [19] "TIME-POINT UNIT"                       
## [20] "TERMINALLY DIFFERENTIATED?"            
## [21] "Model System"                          
## [22] "INPUT CELL LINE ID"                    
## [23] "CLONE ID"                              
## [24] "CELL LINE ID"                          
## [25] "CELL LINE TYPE"                        
## [26] "DERIVED FROM CELL LINE NAME"           
## [27] "CELL LINE DESCRIPTION"                 
## [28] "ALTERED GENE SYMBOLS"                  
## [29] "ALTERATION TYPE"                       
## [30] "TARGETED GENOMIC REGION"               
## [31] "TARGETED GENOMIC REGION COORDINATES"   
## [32] "GUIDE SEQUENCE"                        
## [33] "ZYGOSITY"                              
## [34] "GENE EXPRESSION ALTERATION PROTOCOL ID"
## [35] "LIBRARY PREPARATION PROTOCOL ID"       
## [36] "SEQUENCING PROTOCOL ID"                
## [37] "DIFFERENTIATION PROTOCOL ID"

Gather cell line id information

meta_flat$`DIFFERENTIATED CELL LINE ID`[1:5]
## [1] "PrS-MOK20026W-H02" "PrS-MOK20026W-G02" "PrS-MOK20026W-E02"
## [4] "PrS-MOK20026P-E12" "PrS-MOK20026P-D10"
cell_line_id = str_split(meta_flat$`DIFFERENTIATED CELL LINE ID`, pattern = '-')
table(sapply(cell_line_id, length))
## 
##  3  4 
## 66 24
cell_line_id = lapply(cell_line_id, function(x) { length(x) <- 4; x})
cell_line_id = as.data.frame(do.call(rbind, cell_line_id))
dim(cell_line_id)
## [1] 90  4
names(cell_line_id) = c("model_system", "cell_line", "clone_id", "condition")
cell_line_id[1:2,]
##   model_system cell_line clone_id condition
## 1          PrS MOK20026W      H02      <NA>
## 2          PrS MOK20026W      G02      <NA>
cell_line_id$condition[which(is.na(cell_line_id$condition))] = "nor"

table(meta_flat$`Model System`, cell_line_id$model_system)
##                                            
##                                             ExM PrS
##   extra-embryonic mesenchymal cells          12   0
##   extra-embryonic primitive syncytial cells   0  78
table(cell_line_id$model_system, cell_line_id$condition, useNA="ifany")
##      
##       hyp nor
##   ExM   0  12
##   PrS  12  66

Add run_id and ko_strategy.

Run ID “20231004_23-robson-008-run2” is very similar to “20230918_23-robson-008” in experiment. It is merged into “20230918_23-robson-008”.

table(meta_flat$`RUN ID`)
## 
## 20230809_23-robson-005-run2      20230918_23-robson-008 
##                          30                          23 
##      20230918_23-robson-009      20230918_23-robson-010 
##                          12                          12 
## 20231004_23-robson-008-run2      20231004_23-robson-011 
##                           1                          12
w2update = which(meta_flat$`RUN ID` == "20231004_23-robson-008-run2")
meta_flat$run_id = meta_flat$`RUN ID`
meta_flat$run_id[w2update] = "20230918_23-robson-008"

table(meta_flat$run_id, cell_line_id$model_system, useNA="ifany")
##                              
##                               ExM PrS
##   20230809_23-robson-005-run2   0  30
##   20230918_23-robson-008        0  24
##   20230918_23-robson-009        0  12
##   20230918_23-robson-010        0  12
##   20231004_23-robson-011       12   0
table(meta_flat$run_id, cell_line_id$condition, useNA="ifany")
##                              
##                               hyp nor
##   20230809_23-robson-005-run2   0  30
##   20230918_23-robson-008       12  12
##   20230918_23-robson-009        0  12
##   20230918_23-robson-010        0  12
##   20231004_23-robson-011        0  12
table(meta_flat$`TARGETED GENOMIC REGION`, useNA="ifany")
## 
##                critical exon (CE)           full coding region (KO) 
##                                24                                24 
## premature termination codon (PTC)                              <NA> 
##                                24                                18
meta_flat$ko_strategy = str_extract(meta_flat$`TARGETED GENOMIC REGION`, 
                                    "\\(\\S+\\)$")
meta_flat$ko_strategy = str_remove_all(meta_flat$ko_strategy, "\\(|\\)")
meta_flat$ko_strategy[which(is.na(meta_flat$ko_strategy))] = "WT"

table(meta_flat$`TARGETED GENOMIC REGION`, meta_flat$ko_strategy, 
      useNA="ifany")
##                                    
##                                     CE KO PTC WT
##   critical exon (CE)                24  0   0  0
##   full coding region (KO)            0 24   0  0
##   premature termination codon (PTC)  0  0  24  0
##   <NA>                               0  0   0 18
meta_flat$ko_gene = meta_flat$`ALTERED GENE SYMBOLS`
meta_flat$ko_gene[which(is.na(meta_flat$ko_gene))] = "WT"

table(cell_line_id$model_system, meta_flat$ko_gene)
##      
##       EPAS1 FOSB GCM1 GRHL1 ISL1 POU2F3 PPARG WT
##   ExM     0    0    0     0    9      0     0  3
##   PrS    18    9    9     9    0      9     9 15
table(meta_flat$run_id, meta_flat$ko_gene)
##                              
##                               EPAS1 FOSB GCM1 GRHL1 ISL1 POU2F3 PPARG WT
##   20230809_23-robson-005-run2     0    9    0     9    0      9     0  3
##   20230918_23-robson-008         18    0    0     0    0      0     0  6
##   20230918_23-robson-009          0    0    9     0    0      0     0  3
##   20230918_23-robson-010          0    0    0     0    0      0     9  3
##   20231004_23-robson-011          0    0    0     0    9      0     0  3
table(meta_flat$ko_strategy, meta_flat$ko_gene)
##      
##       EPAS1 FOSB GCM1 GRHL1 ISL1 POU2F3 PPARG WT
##   CE      6    3    3     3    3      3     3  0
##   KO      6    3    3     3    3      3     3  0
##   PTC     6    3    3     3    3      3     3  0
##   WT      0    0    0     0    0      0     0 18

Read in gene count data and filter genes

meta = cbind(meta_flat, cell_line_id)
cts = fread(file.path(dat_path, "genesCounts.csv"), data.table = FALSE)
dim(cts)
## [1] 38592    91
cts[1:2, c(1:2, (ncol(cts)-1):ncol(cts))]
##              Name POU2F3_KO_F04_GT23-10504_GGTACCTT-GACGTCTT_S33_L001
## 1 ENSG00000268674                                                   0
## 2 ENSG00000271254                                                1030
##   PTC_A09__Hypoxia_GT23-11309_AACGTTCC-AGTACTCC_S30_L001
## 1                                                      0
## 2                                                    489
##   PTC_D10__Hypoxia_GT23-11310_GCAGAATT-TGGCCGGT_S22_L001
## 1                                                      0
## 2                                                    603
meta$file_id = meta$`Counts file`

setequal(names(cts)[-1], meta$file_id)
## [1] TRUE
meta = meta[match(names(cts)[-1], meta$file_id),]
table(names(cts)[-1] == meta$file_id)
## 
## TRUE 
##   90
cts_dat = data.matrix(cts[,-1])
rownames(cts_dat) = cts$Name

Read in gene annoation

gene_anno = fread("data/gencode_v44_primary_assembly_info.tsv")
dim(gene_anno)
## [1] 62754     8
gene_anno[1:2,]
##                geneId    chr strand     start       end ensembl_gene_id
##                <char> <char> <char>     <int>     <int>          <char>
## 1: ENSG00000000003.16   chrX      - 100627108 100639991 ENSG00000000003
## 2:  ENSG00000000005.6   chrX      + 100584936 100599885 ENSG00000000005
##    hgnc_symbol                                       description
##         <char>                                            <char>
## 1:      TSPAN6 tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858]
## 2:        TNMD   tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757]
table(rownames(cts_dat) %in% gene_anno$ensembl_gene_id)
## 
##  TRUE 
## 38592
# all ensembl gene IDs are contained in the annotation
setdiff(rownames(cts_dat), gene_anno$ensembl_gene_id)
## character(0)

Discard genes whose expression is 0 in more than 75% of samples

model_s = meta$model_system
get_q75 <- function(x){quantile(x, probs = 0.75)}

gene_info = data.frame(Name = cts$Name, 
                       t(apply(cts_dat, 1, tapply, model_s, min)), 
                       t(apply(cts_dat, 1, tapply, model_s, median)), 
                       t(apply(cts_dat, 1, tapply, model_s, get_q75)), 
                       t(apply(cts_dat, 1, tapply, model_s, max)))

dim(gene_info)
## [1] 38592     9
gene_info[1:2,]
##                            Name ExM PrS ExM.1 PrS.1  ExM.2 PrS.2 ExM.3 PrS.3
## ENSG00000268674 ENSG00000268674   0   0   0.0   0.0   0.00     0     0     3
## ENSG00000271254 ENSG00000271254 634 387 812.5 775.5 848.25   923   948  1169
names(gene_info)[2:9] = paste0(rep(c("ExM_", "PrS_"), times=4), 
                               rep(c("min", "median", "q75", "max"), each=2))
dim(gene_info)
## [1] 38592     9
gene_info[1:2,]
##                            Name ExM_min PrS_min ExM_median PrS_median ExM_q75
## ENSG00000268674 ENSG00000268674       0       0        0.0        0.0    0.00
## ENSG00000271254 ENSG00000271254     634     387      812.5      775.5  848.25
##                 PrS_q75 ExM_max PrS_max
## ENSG00000268674       0       0       3
## ENSG00000271254     923     948    1169
summary(gene_info[,-1])
##     ExM_min            PrS_min           ExM_median         PrS_median      
##  Min.   :     0.0   Min.   :     0.0   Min.   :     0.0   Min.   :     0.0  
##  1st Qu.:     0.0   1st Qu.:     0.0   1st Qu.:     0.0   1st Qu.:     0.0  
##  Median :     0.0   Median :     0.0   Median :     2.0   Median :     1.0  
##  Mean   :   376.3   Mean   :   255.1   Mean   :   530.5   Mean   :   585.1  
##  3rd Qu.:   120.0   3rd Qu.:    60.0   3rd Qu.:   188.5   3rd Qu.:   182.5  
##  Max.   :512761.0   Max.   :154723.0   Max.   :797550.5   Max.   :379480.0  
##     ExM_q75            PrS_q75            ExM_max            PrS_max      
##  Min.   :     0.0   Min.   :     0.0   Min.   :     0.0   Min.   :     0  
##  1st Qu.:     0.0   1st Qu.:     0.0   1st Qu.:     0.0   1st Qu.:     1  
##  Median :     3.0   Median :     3.0   Median :     6.0   Median :    10  
##  Mean   :   608.8   Mean   :   712.7   Mean   :   751.6   Mean   :  1102  
##  3rd Qu.:   228.0   3rd Qu.:   233.8   3rd Qu.:   285.0   3rd Qu.:   405  
##  Max.   :886970.0   Max.   :456738.2   Max.   :928104.0   Max.   :801404
table(gene_info$ExM_q75 > 0, gene_info$PrS_q75 > 0)
##        
##         FALSE  TRUE
##   FALSE 12757   959
##   TRUE   2139 22737
w2kp = which(gene_info$ExM_q75 > 0 | gene_info$PrS_q75 > 0)
cts_dat = cts_dat[w2kp,]
dim(cts_dat)
## [1] 25835    90
gene_info = gene_info[w2kp,]

meta$read_depth = colSums(cts_dat)/1e6
meta$q75 = apply(cts_dat, 2, quantile, probs = 0.75)

ggplot(meta, aes(x=read_depth, y=q75, color=ko_strategy, shape = model_system)) +
    geom_point(size=2, alpha=0.7) + ggtitle("Gene counts") + 
  scale_shape_manual(values = c(7, 16)) + 
  xlab("read depth (million)") + ylab("75 percentile of gene expression") + 
  labs(color = "KO type", shape = "Model") + 
  scale_color_brewer(palette = "Set1")

meta$run_id_short = str_replace(meta$run_id, "^[^-]*-", "")

ggplot(meta, aes(x=read_depth, y=q75, color=run_id_short, shape = condition)) +
    geom_point(size=2, alpha=0.7) + ggtitle("Gene counts") + 
  scale_shape_manual(values = c(7, 16)) + 
  xlab("read depth (million)") + ylab("75 percentile of gene expression") + 
  labs(color = "Run ID", shape = "Condition") + 
  scale_color_brewer(palette = "Set1")

Check the effect of hypoxia vs. normoxia among WT

sample2kp = which(meta$ko_gene == "WT")
cts_dat_m = cts_dat[,sample2kp]
meta_m    = meta[sample2kp,]
summary(meta_m$q75)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   361.0   422.2   469.8   478.4   540.5   592.0
stopifnot(all(meta_m$file_id == colnames(cts_dat_m)))

q75 = apply(cts_dat_m, 1, quantile, probs=0.75)
cts_dat_n = t(cts_dat_m[q75 > 0,])
cts_dat_n = log(median(meta_m$q75)*cts_dat_n/meta_m$q75 + 1)
dim(cts_dat_n)
## [1]    18 23865
sample_pca = prcomp(cts_dat_n, center = TRUE, scale. = TRUE)
names(sample_pca)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
pc_scores = sample_pca$x
eigen_vals = sample_pca$sdev^2
eigen_vals[1:5]/sum(eigen_vals)
## [1] 0.41336077 0.18367275 0.08208396 0.04567809 0.03608958
pvs = round(eigen_vals[1:5]/sum(eigen_vals)*100,1)
pvs
## [1] 41.3 18.4  8.2  4.6  3.6
PC_df = data.frame(pc_scores)
PC_df = cbind(PC_df, meta_m)

gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=condition, color=model_system)) +
  geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
    scale_shape_manual(values = c(7, 15, 16, 17)) + 
  labs(color="KO gene", shape ="KO strategy") + 
  xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) + 
  ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) + 
  ggtitle("Wild type samples") + 
  guides(
    color = guide_legend(title = NULL, order = 1),
    shape = guide_legend(title = NULL, order = 2)
  ) +
  theme(
    legend.margin = margin(0, 0, 0, 0), 
    legend.box.just = "left", 
    legend.direction = "vertical" 
  )

print(gPC)

Check the impact of run_id and hypoxia vs. normoxia effect on WT

gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=condition, color=run_id_short)) +
  geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
    scale_shape_manual(values = c(7, 15, 16, 17)) + 
  labs(color="KO gene", shape ="KO strategy") + 
  xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) + 
  ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) + 
  ggtitle("Wild type samples") + 
  guides(
    color = guide_legend(title = NULL, order = 1),
    shape = guide_legend(title = NULL, order = 2)
  ) +
  theme(
    legend.margin = margin(0, 0, 0, 0), 
    legend.box.just = "left", 
    legend.direction = "vertical" 
  )

print(gPC)

table(meta_m$run_id, meta_m$model_system)
##                              
##                               ExM PrS
##   20230809_23-robson-005-run2   0   3
##   20230918_23-robson-008        0   6
##   20230918_23-robson-009        0   3
##   20230918_23-robson-010        0   3
##   20231004_23-robson-011        3   0

DE analysis with respect to model system and condition (hypoxia vs. normoxia)

## Generate sample information matrix

cols2kp = c("model_system", "condition", "q75")
sample_info = meta_m[,cols2kp]

dim(sample_info)
## [1] 18  3
sample_info[1:2,]
##    model_system condition   q75
## 12          PrS       nor 569.5
## 68          PrS       hyp 545.0
sample_info$log_q75 = log(sample_info$q75)

dds = DESeqDataSetFromMatrix(cts_dat_m, colData=sample_info, 
                                  ~ model_system + condition + log_q75)
dds = DESeq(dds)

## association with read-depth
res_rd = results(dds, name = "log_q75")
res_rd = as.data.frame(res_rd)
dim(res_rd)
## [1] 25835     6
res_rd[1:2,]
##                  baseMean log2FoldChange     lfcSE       stat    pvalue
## ENSG00000271254 731.89279      0.0114876 0.2318513 0.04954726 0.9604832
## ENSG00000277196  46.50349      0.9817827 1.3388605 0.73329722 0.4633772
##                      padj
## ENSG00000271254 0.9997047
## ENSG00000277196 0.9997047
table(is.na(res_rd$padj))
## 
## FALSE  TRUE 
## 17751  8084
g0 = ggplot(res_rd %>% subset(!is.na(padj)), aes(x=pvalue)) + 
  geom_histogram(color="darkblue", fill="lightblue", 
                 breaks = seq(0,1,by=0.01)) + 
  ggtitle("Read depth")
print(g0)

## association with model_system
dds_lrt = DESeq(dds, test="LRT", reduced = ~ condition + log_q75)
res_lrt = results(dds_lrt)

res_model_system = as.data.frame(res_lrt)
dim(res_model_system)
## [1] 25835     6
res_model_system[1:2,]
##                  baseMean log2FoldChange     lfcSE       stat     pvalue
## ENSG00000271254 731.89279      0.0114876 0.2318513 4.06378556 0.04381218
## ENSG00000277196  46.50349      0.9817827 1.3388605 0.04552192 0.83104721
##                       padj
## ENSG00000271254 0.06908888
## ENSG00000277196 0.87097036
table(is.na(res_model_system$padj))
## 
## FALSE  TRUE 
## 23736  2099
g0 = ggplot(res_model_system %>% subset(!is.na(padj)), aes(x=pvalue)) + 
  geom_histogram(color="darkblue", fill="lightblue", 
                 breaks = seq(0,1,by=0.01)) + 
  ggtitle("Model system")
print(g0)

## association with condition
dds_lrt = DESeq(dds, test="LRT", reduced = ~ model_system + log_q75)
res_lrt = results(dds_lrt)

res_condition = as.data.frame(res_lrt)
dim(res_condition)
## [1] 25835     6
res_condition[1:2,]
##                  baseMean log2FoldChange     lfcSE      stat     pvalue
## ENSG00000271254 731.89279      0.0114876 0.2318513 0.2926949 0.58849873
## ENSG00000277196  46.50349      0.9817827 1.3388605 2.9420549 0.08630088
##                      padj
## ENSG00000271254 0.6952827
## ENSG00000277196 0.1494475
table(is.na(res_condition$padj))
## 
## FALSE  TRUE 
## 22739  3096
g0 = ggplot(res_condition %>% subset(!is.na(padj)), aes(x=pvalue)) + 
  geom_histogram(color="darkblue", fill="lightblue", 
                 breaks = seq(0,1,by=0.01)) + 
  ggtitle("Condition")
print(g0)

Analyze samples of different model systems or conditions

For “PrS_nor”, the issue of mismatch among CE, KO and PTC between what is got from the “TARGETED GENOMIC REGION” column in flat metadata file and what is got from the filenames is still not solved, as of September 25, 2024. Currently, run it according to what is got from the “TARGETED GENOMIC REGION” column in flat metadata file. Update it later once the mismatch is solved

Explore the PC plots first, to decide running the models on what level of DE group.

Then run the DE analysis.

meta$model_condition = paste(meta$model_system, meta$condition, sep="_")
table(meta$model_condition)
## 
## ExM_nor PrS_hyp PrS_nor 
##      12      12      66
table(meta$run_id, meta$model_condition)
##                              
##                               ExM_nor PrS_hyp PrS_nor
##   20230809_23-robson-005-run2       0       0      30
##   20230918_23-robson-008            0      12      12
##   20230918_23-robson-009            0       0      12
##   20230918_23-robson-010            0       0      12
##   20231004_23-robson-011           12       0       0
unique_model_condition_ss = unique(meta$model_condition)
unique_model_condition_ss = sort(unique_model_condition_ss)

for(cur_mc in unique_model_condition_ss){
  
  print(cur_mc)
  sample2kp = which(meta$model_condition == cur_mc)
  cts_dat_m = cts_dat[,sample2kp]
  meta_m    = meta[sample2kp,]

  print(table(meta_m$ko_strategy, str_extract(colnames(cts_dat_m), "^[^_]+_[^_]+")))
  extracted_ko_strings = str_extract_all(colnames(cts_dat_m), "CE|KO|PTC|WT")
  print(table(sapply(extracted_ko_strings, length)))
  print(table(meta_m$ko_strategy, unlist(extracted_ko_strings)))
  
  if (cur_mc == "PrS_nor"){
    print("The overlap between WT and KO is not real overlap. It is due to the gene name contains substring 'KO' in these three samples:")
    print(meta_m$file_id[(meta_m$ko_strategy=="WT") & (unlist(extracted_ko_strings)=="KO")])
  }
  
  stopifnot(all(meta_m$file_id == colnames(cts_dat_m)))
  q75 = apply(cts_dat_m, 1, quantile, probs=0.75)

  cts_dat_n = t(cts_dat_m[q75 > 0,])
  cts_dat_n = log(median(meta_m$q75)*cts_dat_n/meta_m$q75 + 1)
  dim(cts_dat_n)
  
  sample_pca = prcomp(cts_dat_n, center = TRUE, scale. = TRUE)
  names(sample_pca)
  pc_scores = sample_pca$x
  eigen_vals = sample_pca$sdev^2
  eigen_vals[1:5]/sum(eigen_vals)
  pvs = round(eigen_vals[1:5]/sum(eigen_vals)*100,1)
  pvs
  
  PC_df = data.frame(pc_scores)
  PC_df = cbind(PC_df, meta_m)
  
  gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=ko_strategy, color=ko_gene)) +
    geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
      scale_shape_manual(values = c(7, 15, 16, 17)) + 
    xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) + 
    ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) + 
    ggtitle(cur_mc) + 
    guides(
      color = guide_legend(title = NULL, order = 1),
      shape = guide_legend(title = NULL, order = 2)
    ) +
    theme(
      legend.margin = margin(0, 0, 0, 0), 
      legend.box.just = "left", 
      legend.direction = "vertical" 
    )

  print(gPC)
  
  gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=run_id_short, color=ko_gene)) +
    geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
      scale_shape_manual(values = c(7, 15, 16, 17)) + 
    xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) + 
    ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) + 
    ggtitle(cur_mc) + 
    guides(
      color = guide_legend(title = NULL, order = 1),
      shape = guide_legend(title = NULL, order = 2)
    ) +
    theme(
      legend.margin = margin(0, 0, 0, 0), 
      legend.box.just = "left", 
      legend.direction = "vertical" 
    )

  print(gPC)
  
  
  gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=run_id_short, color=ko_gene, 
                          alpha = ifelse(ko_gene == "WT", 1, 0.8))) +
    geom_point(size=2.5) + 
    scale_color_brewer(palette = "Dark2") + 
    scale_shape_manual(values = c(7, 15, 16, 17)) + 
    xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) + 
    ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) + 
    ggtitle(cur_mc) + guides(alpha = "none") + 
    guides(
      color = guide_legend(title = NULL, order = 1),
      shape = guide_legend(title = NULL, order = 2)
    ) +
    theme(
      legend.margin = margin(0, 0, 0, 0), 
      legend.box.just = "left", 
      legend.direction = "vertical" 
    )

  print(gPC)
  table(meta_m$run_id, meta_m$ko_gene)
  
}
## [1] "ExM_nor"
##      
##       ISL1_CE ISL1_KO ISL1_PTC ISL1_WT
##   CE        3       0        0       0
##   KO        0       3        0       0
##   PTC       0       0        3       0
##   WT        0       0        0       3
## 
##  1 
## 12 
##      
##       CE KO PTC WT
##   CE   3  0   0  0
##   KO   0  3   0  0
##   PTC  0  0   3  0
##   WT   0  0   0  3

## [1] "PrS_hyp"
##      
##       CE_E06 CE_G07 CE_H06 KO_A03 KO_B01 KO_B02 PTC_A09 PTC_D10 PTC_G09 WT_1
##   CE       1      1      1      0      0      0       0       0       0    0
##   KO       0      0      0      1      1      1       0       0       0    0
##   PTC      0      0      0      0      0      0       1       1       1    0
##   WT       0      0      0      0      0      0       0       0       0    1
##      
##       WT_2 WT_3
##   CE     0    0
##   KO     0    0
##   PTC    0    0
##   WT     1    1
## 
##  1 
## 12 
##      
##       CE KO PTC WT
##   CE   3  0   0  0
##   KO   0  3   0  0
##   PTC  0  0   3  0
##   WT   0  0   0  3

## [1] "PrS_nor"
##      
##       CE_E06 CE_G07 CE_H06 FOSB_CE FOSB_KO FOSB_PTC GCM1_CE GCM1_KO GCM1_PTC
##   CE       1      1      1       0       0        3       3       0        0
##   KO       0      0      0       3       0        0       0       3        0
##   PTC      0      0      0       0       3        0       0       0        3
##   WT       0      0      0       0       0        0       0       0        0
##      
##       GCM1_WT GHRL1_CE GHRL1_KO GHRL1_PTC KO_A03 KO_B01 KO_B02 KOLF2_FOSB
##   CE        0        3        0         0      0      0      0          0
##   KO        0        0        3         0      1      1      1          0
##   PTC       0        0        0         3      0      0      0          0
##   WT        3        0        0         0      0      0      0          1
##      
##       KOLF2_GHRL1 KOLF2_POU2F3 POU2F3_CE POU2F3_KO POU2F3_PTC PPARG_CE PPARG_KO
##   CE            0            0         3         0          0        3        0
##   KO            0            0         0         3          0        0        3
##   PTC           0            0         0         0          3        0        0
##   WT            1            1         0         0          0        0        0
##      
##       PPARG_PTC PPARG_WT PTC_A09 PTC_D10 PTC_G09 WT_1 WT_2 WT_3
##   CE          0        0       0       0       0    0    0    0
##   KO          0        0       0       0       0    0    0    0
##   PTC         3        0       1       1       1    0    0    0
##   WT          0        3       0       0       0    1    1    1
## 
##  1 
## 66 
##      
##       CE KO PTC WT
##   CE  15  0   3  0
##   KO   3 15   0  0
##   PTC  0  3  15  0
##   WT   0  3   0  9
## [1] "The overlap between WT and KO is not real overlap. It is due to the gene name contains substring 'KO' in these three samples:"
## [1] "KOLF2_GHRL1_2_GT23-10492_TTACAGGA-GCTTGTCA_S6_L001"  
## [2] "KOLF2_FOSB_1_GT23-10493_GACCTGAA-CTCACCAA_S32_L001"  
## [3] "KOLF2_POU2F3_1_GT23-10491_CCGTGAAG-ATCCACTG_S38_L001"

Use each combination of (model system, condition) as DE group.

Run analysis for each DE group separately.

For the output files, create two versions, one with direct information, one with padj set to NA for the genes when the number of 0 per test is >=4.

In more details, for both versions of the outputs, make it one file per knockout gene per DE group (DE group can be model system, model system + condition, model system + run_id, model system + day, etc.). The first version of the output files contains gene id, and for each knockout strategy:

baseMean, log2FoldChange, lfcSE, stat, pvalue, padj

The second version of the output files contains gene id,

chr, strand, position, gene name

and for each knockout strategy:

log2FoldChange, pvalue, padj (set to be NA if the number of 0 per test >= 4), 

In addition, save out a separate file of the sample size for each comparison. This needs to be by DE group.

df_size = NULL

release_name = "2023_12_JAX_RNAseq_ExtraEmbryonic"

output_dir = file.path("results", release_name)
raw_output_dir = file.path(output_dir, "raw")
processed_output_dir = file.path(output_dir, "processed")

if (!file.exists(raw_output_dir)){
  dir.create(raw_output_dir, recursive = TRUE)
}

if (!file.exists(processed_output_dir)){
  dir.create(processed_output_dir, recursive = TRUE)
}


unique_model_ss = unique(meta$model_system)
unique_model_ss = sort(unique_model_ss)
unique_model_ss
## [1] "ExM" "PrS"
for (model1 in unique_model_ss){
  
  print(model1)
  sample2kp = which(meta$model_system == model1)
  cts_dat_m = cts_dat[,sample2kp]
  meta_m    = meta[sample2kp,]

  stopifnot(all(meta_m$file_id == colnames(cts_dat_m)))
  print(table(meta_m$file_id == colnames(cts_dat_m), useNA="ifany"))  
  
  ## Generate sample information matrix
  cols2kp = c("model_system", "condition", "ko_strategy", "ko_gene", "run_id", 
              "run_id_short", "q75")
  sample_info = meta_m[,cols2kp]
    
  dim(sample_info)
  sample_info[1:2,]
      
  print(table(sample_info$ko_strategy, sample_info$ko_gene))
  
  sample_info$ko_group = paste(sample_info$ko_gene, 
                               sample_info$ko_strategy, sep="_")
  print(table(sample_info$ko_group))
  
  # use the combination of (model system, condition) as DE group
  
  sample_info$de_grp = paste0(sample_info$model_system, "_", 
                              sample_info$condition)

  sorted_de_grps = sort(unique(sample_info$de_grp))
  sorted_de_grps

  for(d_group in sorted_de_grps){
    
    print(d_group)
    
    dg_index = which(sample_info$de_grp==d_group)
    cts_dat_m_dg = cts_dat_m[, dg_index]
    sample_info_dg = sample_info[dg_index, ]

    stopifnot(all(sample_info_dg$de_grp==d_group))   
    
    print("-----------------------------------")      
    print(sprintf("DE analysis group %s DESeq2 results:", d_group))
    print("-----------------------------------")  
      

    # do two steps of filtering
    # first, filter based on q75 of each gene
    # second, filter based on whether each gene is expressed in at least 4 cells
  
    dg_q75 = apply(cts_dat_m_dg, 1, quantile, probs=0.75)
    cts_dat_m_dg = cts_dat_m_dg[dg_q75 > 0,]
    
    print(sprintf("After filtering by gene expression q75, the number of genes reduces from %d to %d", 
                  length(dg_q75), nrow(cts_dat_m_dg)))
    
    n_pos = rowSums(cts_dat_m_dg>0)
    cts_dat_m_dg = cts_dat_m_dg[n_pos >= 4,]
    
    if (length(n_pos)==nrow(cts_dat_m_dg)){
      print("After filtering by nonzero expression in at least 4 samples, the number of genes does not change.")
    }else{
      print(sprintf("After filtering by nonzero expression in at least 4 samples, the number of genes reduces from %d to %d", 
                    length(n_pos), nrow(cts_dat_m_dg)))    
    }
  
    # update the q75 based on only genes used in the process
    sample_info_dg$q75 = apply(cts_dat_m_dg, 2, quantile, probs = 0.75)
    sample_info_dg$log_q75 = log(sample_info_dg$q75)   

    
    if (length(unique(sample_info_dg$run_id))==1){
      dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg, 
                                    ~ ko_group + log_q75)
    }else{
      dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg, 
                                    ~ ko_group + run_id + log_q75)     
    }
    
    start.time <- Sys.time()
    dds = DESeq(dds)
    end.time <- Sys.time()
    
    time.taken <- end.time - start.time
    print(time.taken)
    
    ## association with read-depth
    res_rd = results(dds, name = "log_q75")
    res_rd = as.data.frame(res_rd)
    dim(res_rd)
    res_rd[1:2,]
    
    table(is.na(res_rd$padj))
    
    g0 = ggplot(res_rd %>% subset(!is.na(padj)), aes(x=pvalue)) + 
      geom_histogram(color="darkblue", fill="lightblue", 
                     breaks = seq(0,1,by=0.01)) + 
      ggtitle(paste0(d_group, " read depth"))
    print(g0)
    
    ## Rerun the analysis without read-depth if it is not significant for 
    ## most genes, and the number of samples is small
    ## i.e., proportion of non-null in the distribution is smaller than 0.01
    ## (this needs to be restricted to the genes with not NA adjusted pvalue)
    ## or if smaller than 0.1 and the number of samples is not greater than 6

    pi_1_rd = max(0, mean(res_rd$pvalue[!is.na(res_rd$padj)] < 0.05) - 0.05)
    pi_1_rd = max(pi_1_rd, 0, 1 - 2*mean(res_rd$pvalue[!is.na(res_rd$padj)] > 0.5))
    
    print(sprintf("prop of non-null for rd: %.2e", pi_1_rd))
    
    if(pi_1_rd < 0.01 || (ncol(cts_dat_m_dg) <= 6 && pi_1_rd < 0.1 )){
      
      print("rerun DESeq2 without read depth")
      
      include_rd = 0
      if (length(unique(sample_info_dg$run_id))==1){
        dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg, 
                                      ~ ko_group)
      }else{
        dds = DESeqDataSetFromMatrix(cts_dat_m_dg, colData=sample_info_dg, 
                                      ~ ko_group + run_id)     
      }
      start.time <- Sys.time()
      dds = DESeq(dds)
      end.time <- Sys.time()
      
      time.taken <- end.time - start.time
      print(time.taken)
      
    }else{
      include_rd = 1
    }
    
    
     ## association with run_id
    
    if (length(unique(sample_info_dg$run_id))>1){
      
      if(include_rd){
        dds_lrt = DESeq(dds, test="LRT", reduced = ~ ko_group + log_q75)
      }else{
        dds_lrt = DESeq(dds, test="LRT", reduced = ~ ko_group)
      }
      
      res_lrt = results(dds_lrt)
    
      res_run_id = as.data.frame(res_lrt)
      dim(res_run_id)
      res_run_id[1:2,]
    
      table(is.na(res_run_id$padj))
      
      g0 = ggplot(res_run_id %>% subset(!is.na(padj)), aes(x=pvalue)) + 
        geom_histogram(color="darkblue", fill="lightblue", 
                       breaks = seq(0,1,by=0.01)) + 
        ggtitle(paste0(d_group, " Run ID"))
      print(g0)
      
    }   
    
    ## DE analysis for each KO gene
    table(sample_info_dg$ko_gene)
    table(sample_info_dg$ko_gene, sample_info_dg$ko_strategy)
    
    genos = setdiff(unique(sample_info_dg$ko_gene), "WT")
    genos
    
    ko_grp  = c("CE", "KO", "PTC")
      
    for(geno in genos){
      
      res_geno_df = NULL
      res_geno_reduced_df = NULL
      res_geno = list()
      
      for(k_grp1 in ko_grp){
        
        res_g1 = results(dds, contrast = c("ko_group", 
                                           paste0(geno, "_", k_grp1), "WT_WT"))
        res_g1 = signif(data.frame(res_g1), 3)
        
        # add a column to record the number of zeros per test
        test_index = which(sample_info_dg$ko_group%in%c(paste0(geno, "_", k_grp1), "WT_WT"))
        
        cts_dat_m_dg_test = cts_dat_m_dg[, test_index]
        n_zero = rowSums(cts_dat_m_dg_test==0)
        res_g1$n_zeros = n_zero
        
        # record the number of samples involved in the comparison
        test_ko_group_vec = sample_info_dg$ko_group[test_index]
        
        df_size = rbind(df_size, 
                        c(d_group, 
                          paste0(geno, "_", k_grp1), 
                          sum(test_ko_group_vec!="WT_WT"), 
                          sum(test_ko_group_vec=="WT_WT")))
        
        # prepare a processed version of output
        res_g1_reduced = res_g1[, c("log2FoldChange", "pvalue", "padj", "n_zeros")]
        res_g1_reduced$padj[which(res_g1_reduced$n_zeros>=4)] = NA
        
        if ((sum(test_ko_group_vec!="WT_WT")==1)|(sum(test_ko_group_vec=="WT_WT")==1)){
          res_g1_reduced$padj = NA
        }
        
        res_g1_reduced = res_g1_reduced[, c("log2FoldChange", "pvalue", "padj")]
        
        res_geno[[k_grp1]] = res_g1_reduced
        
        if ((sum(test_ko_group_vec!="WT_WT")>1) & (sum(test_ko_group_vec=="WT_WT")>1)){
          g1 = ggplot(res_g1_reduced %>% subset(!is.na(padj)), aes(x=pvalue)) + 
            geom_histogram(color="darkblue", fill="lightblue", 
                           breaks = seq(0,1,by=0.01)) + 
            ggtitle(paste0(d_group, " ", geno, "_", k_grp1))
          print(g1)
        }

        
        tag1 = sprintf("%s_%s_", geno, k_grp1)
        colnames(res_g1) = paste0(tag1, colnames(res_g1))
        colnames(res_g1_reduced) = paste0(tag1, colnames(res_g1_reduced))      
        
        if(is.null(res_geno_df)){
          res_geno_df = res_g1
        }else if(!is.null(res_geno_df)){
          stopifnot(all(rownames(res_geno_df) == rownames(res_g1)))
          res_geno_df = cbind(res_geno_df, res_g1)
        }

        if(is.null(res_geno_reduced_df)){
          res_geno_reduced_df = res_g1_reduced
        }else if(!is.null(res_geno_reduced_df)){
          stopifnot(all(rownames(res_geno_reduced_df) == rownames(res_g1_reduced)))
          res_geno_reduced_df = cbind(res_geno_reduced_df, res_g1_reduced)
        }       

      }
      
      get_index <- function(x){
        which(x$padj < 0.05 & abs(x$log2FoldChange) > log2(1.5))
      }
      
      w_ce = get_index(res_geno[["CE"]])
      w_ko = get_index(res_geno[["KO"]])
      w_ptc = get_index(res_geno[["PTC"]])
 
      print(paste0("DE group ", d_group, " under KO gene ", geno, ":"))
      
      print(paste0("number of DE genes from knock out strategy CE: ", 
                   as.character(length(w_ce))))
      print(paste0("number of DE genes from knock out strategy KO: ", 
                   as.character(length(w_ko))))
      print(paste0("number of DE genes from knock out strategy PTC: ", 
                   as.character(length(w_ptc))))

      ce_ko_overlap = length(intersect(rownames(res_geno[["CE"]])[w_ce], 
                                       rownames(res_geno[["KO"]])[w_ko]))
      
      ko_ptc_overlap = length(intersect(rownames(res_geno[["KO"]])[w_ko], 
                                        rownames(res_geno[["PTC"]])[w_ptc]))
      
      ptc_ce_overlap = length(intersect(rownames(res_geno[["PTC"]])[w_ptc], 
                                        rownames(res_geno[["CE"]])[w_ce]))
      
      print(paste0("number of common DE genes by knock out strategies CE and KO: ", 
                   as.character(ce_ko_overlap)))
      print(paste0("number of common DE genes by knock out strategies KO and PTC: ", 
                   as.character(ko_ptc_overlap)))
      print(paste0("number of common DE genes by knock out strategies PTC and CE: ", 
                   as.character(ptc_ce_overlap)))

      if (max(length(w_ce), length(w_ko), length(w_ptc)) > 0){
        m = make_comb_mat(list("CE" = rownames(res_geno[["CE"]])[w_ce], 
                               "KO" = rownames(res_geno[["KO"]])[w_ko], 
                               "PTC" = rownames(res_geno[["PTC"]])[w_ptc]))
        g_up = UpSet(m)
        print(g_up)
      }
      
      df1 = data.frame(padj_CE = res_geno[["CE"]]$padj, 
                       padj_KO = res_geno[["KO"]]$padj, 
                       padj_PTC = res_geno[["PTC"]]$padj)
  
      cr1 = cor(df1$padj_PTC, df1$padj_CE, method = "spearman", use="pair")
      cr2 = cor(df1$padj_PTC, df1$padj_KO, method = "spearman", use="pair")
      
      g4 = ggplot(data = df1, mapping = aes(x = -log10(padj_PTC), 
                                            y = -log10(padj_CE))) +
        geom_pointdensity() + 
        ggtitle(sprintf("%s, %s\nSpearman r = %.2f", d_group, geno, cr1)) + 
        scale_color_viridis()
      
      g5 = ggplot(data = df1, mapping = aes(x = -log10(padj_PTC), 
                                            y = -log10(padj_KO))) +
        geom_pointdensity() + 
        ggtitle(sprintf("%s, %s\nSpearman r = %.2f", d_group, geno, cr2)) + 
        scale_color_viridis()
      
      print(g4)
      print(g5)

    
      dim(res_geno_df)
      res_geno_df[1:2,]
      
      dim(res_geno_reduced_df)
      res_geno_reduced_df[1:2,]
      
      res_df = data.frame(gene_ID=rownames(res_geno_df), 
                          res_geno_df)
      dim(res_df)
      res_df[1:2,]
  
      res_reduced_gene_anno = gene_anno[match(rownames(res_geno_reduced_df), 
                                              gene_anno$ensembl_gene_id), ]
      stopifnot(all(rownames(res_geno_reduced_df)==res_reduced_gene_anno$ensembl_gene_id))
  
      # set gene hgnc_symbol that equal "" to NA
      hgnc_symbol_vec = res_reduced_gene_anno$hgnc_symbol
      hgnc_symbol_vec[which(hgnc_symbol_vec=="")] = NA
        
      res_reduced_df = data.frame(gene_ID=rownames(res_geno_reduced_df), 
                                  hgnc_symbol=hgnc_symbol_vec,
                                  chr=res_reduced_gene_anno$chr, 
                                  strand=res_reduced_gene_anno$strand, 
                                  start=res_reduced_gene_anno$start, 
                                  end=res_reduced_gene_anno$end, 
                                  res_geno_reduced_df)
      
      print("hgnc_symbol empty string and NA tables:")
      print(table(res_reduced_df$hgnc_symbol=="", useNA="ifany"))
      print(table(is.na(res_reduced_df$hgnc_symbol)))
        
      dim(res_reduced_df)
      res_reduced_df[1:2,]
  
      fwrite(res_df, 
             sprintf("%s/%s_%s_%s_DEseq2_raw.tsv", 
                     raw_output_dir, release_name, d_group, geno), 
             sep="\t")
      fwrite(res_reduced_df, 
             sprintf("%s/%s_%s_%s_DEseq2.tsv", 
                     processed_output_dir, release_name, d_group, geno), 
             sep="\t")
    
    }
    
  }

}
## [1] "ExM"
## 
## TRUE 
##   12 
##      
##       ISL1 WT
##   CE     3  0
##   KO     3  0
##   PTC    3  0
##   WT     0  3
## 
##  ISL1_CE  ISL1_KO ISL1_PTC    WT_WT 
##        3        3        3        3 
## [1] "ExM_nor"
## [1] "-----------------------------------"
## [1] "DE analysis group ExM_nor DESeq2 results:"
## [1] "-----------------------------------"
## [1] "After filtering by gene expression q75, the number of genes reduces from 25835 to 24876"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes reduces from 24876 to 23842"
## Time difference of 1.585154 mins

## [1] "prop of non-null for rd: 0.00e+00"
## [1] "rerun DESeq2 without read depth"
## Time difference of 8.388163 secs

## [1] "DE group ExM_nor under KO gene ISL1:"
## [1] "number of DE genes from knock out strategy CE: 652"
## [1] "number of DE genes from knock out strategy KO: 405"
## [1] "number of DE genes from knock out strategy PTC: 432"
## [1] "number of common DE genes by knock out strategies CE and KO: 343"
## [1] "number of common DE genes by knock out strategies KO and PTC: 324"
## [1] "number of common DE genes by knock out strategies PTC and CE: 380"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18935  4907 
## 
## FALSE  TRUE 
## 18935  4907 
## [1] "PrS"
## 
## TRUE 
##   78 
##      
##       EPAS1 FOSB GCM1 GRHL1 POU2F3 PPARG WT
##   CE      6    3    3     3      3     3  0
##   KO      6    3    3     3      3     3  0
##   PTC     6    3    3     3      3     3  0
##   WT      0    0    0     0      0     0 15
## 
##   EPAS1_CE   EPAS1_KO  EPAS1_PTC    FOSB_CE    FOSB_KO   FOSB_PTC    GCM1_CE 
##          6          6          6          3          3          3          3 
##    GCM1_KO   GCM1_PTC   GRHL1_CE   GRHL1_KO  GRHL1_PTC  POU2F3_CE  POU2F3_KO 
##          3          3          3          3          3          3          3 
## POU2F3_PTC   PPARG_CE   PPARG_KO  PPARG_PTC      WT_WT 
##          3          3          3          3         15 
## [1] "PrS_hyp"
## [1] "-----------------------------------"
## [1] "DE analysis group PrS_hyp DESeq2 results:"
## [1] "-----------------------------------"
## [1] "After filtering by gene expression q75, the number of genes reduces from 25835 to 23911"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes reduces from 23911 to 23326"
## Time difference of 9.515882 secs

## [1] "prop of non-null for rd: 8.49e-02"

## [1] "DE group PrS_hyp under KO gene EPAS1:"
## [1] "number of DE genes from knock out strategy CE: 4196"
## [1] "number of DE genes from knock out strategy KO: 2140"
## [1] "number of DE genes from knock out strategy PTC: 2553"
## [1] "number of common DE genes by knock out strategies CE and KO: 1883"
## [1] "number of common DE genes by knock out strategies KO and PTC: 1628"
## [1] "number of common DE genes by knock out strategies PTC and CE: 1960"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18654  4672 
## 
## FALSE  TRUE 
## 18654  4672 
## [1] "PrS_nor"
## [1] "-----------------------------------"
## [1] "DE analysis group PrS_nor DESeq2 results:"
## [1] "-----------------------------------"
## [1] "After filtering by gene expression q75, the number of genes reduces from 25835 to 23493"
## [1] "After filtering by nonzero expression in at least 4 samples, the number of genes does not change."
## Time difference of 1.400877 mins

## [1] "prop of non-null for rd: 1.30e-01"

## [1] "DE group PrS_nor under KO gene POU2F3:"
## [1] "number of DE genes from knock out strategy CE: 2038"
## [1] "number of DE genes from knock out strategy KO: 590"
## [1] "number of DE genes from knock out strategy PTC: 8"
## [1] "number of common DE genes by knock out strategies CE and KO: 462"
## [1] "number of common DE genes by knock out strategies KO and PTC: 6"
## [1] "number of common DE genes by knock out strategies PTC and CE: 5"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18811  4682 
## 
## FALSE  TRUE 
## 18811  4682

## [1] "DE group PrS_nor under KO gene PPARG:"
## [1] "number of DE genes from knock out strategy CE: 4503"
## [1] "number of DE genes from knock out strategy KO: 3029"
## [1] "number of DE genes from knock out strategy PTC: 4014"
## [1] "number of common DE genes by knock out strategies CE and KO: 2690"
## [1] "number of common DE genes by knock out strategies KO and PTC: 2334"
## [1] "number of common DE genes by knock out strategies PTC and CE: 3173"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18811  4682 
## 
## FALSE  TRUE 
## 18811  4682

## [1] "DE group PrS_nor under KO gene GCM1:"
## [1] "number of DE genes from knock out strategy CE: 5299"
## [1] "number of DE genes from knock out strategy KO: 4488"
## [1] "number of DE genes from knock out strategy PTC: 4255"
## [1] "number of common DE genes by knock out strategies CE and KO: 4014"
## [1] "number of common DE genes by knock out strategies KO and PTC: 3770"
## [1] "number of common DE genes by knock out strategies PTC and CE: 3823"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18811  4682 
## 
## FALSE  TRUE 
## 18811  4682

## [1] "DE group PrS_nor under KO gene FOSB:"
## [1] "number of DE genes from knock out strategy CE: 121"
## [1] "number of DE genes from knock out strategy KO: 409"
## [1] "number of DE genes from knock out strategy PTC: 847"
## [1] "number of common DE genes by knock out strategies CE and KO: 43"
## [1] "number of common DE genes by knock out strategies KO and PTC: 324"
## [1] "number of common DE genes by knock out strategies PTC and CE: 103"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18811  4682 
## 
## FALSE  TRUE 
## 18811  4682

## [1] "DE group PrS_nor under KO gene EPAS1:"
## [1] "number of DE genes from knock out strategy CE: 1372"
## [1] "number of DE genes from knock out strategy KO: 40"
## [1] "number of DE genes from knock out strategy PTC: 304"
## [1] "number of common DE genes by knock out strategies CE and KO: 37"
## [1] "number of common DE genes by knock out strategies KO and PTC: 29"
## [1] "number of common DE genes by knock out strategies PTC and CE: 236"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18811  4682 
## 
## FALSE  TRUE 
## 18811  4682

## [1] "DE group PrS_nor under KO gene GRHL1:"
## [1] "number of DE genes from knock out strategy CE: 3439"
## [1] "number of DE genes from knock out strategy KO: 2483"
## [1] "number of DE genes from knock out strategy PTC: 5095"
## [1] "number of common DE genes by knock out strategies CE and KO: 1980"
## [1] "number of common DE genes by knock out strategies KO and PTC: 2044"
## [1] "number of common DE genes by knock out strategies PTC and CE: 3057"

## [1] "hgnc_symbol empty string and NA tables:"
## 
## FALSE  <NA> 
## 18811  4682 
## 
## FALSE  TRUE 
## 18811  4682
colnames(df_size) = c("DE_group", "knockout_gene_strategy", "n_ko", "n_WT")
fwrite(df_size, 
       sprintf("%s/%s_DE_n_samples.tsv", output_dir, release_name), 
       sep="\t")
gc()
##            used  (Mb) gc trigger  (Mb) limit (Mb)  max used  (Mb)
## Ncells  8146656 435.1   12621695 674.1         NA  12621695 674.1
## Vcells 40262056 307.2  101126503 771.6      65536 101126503 771.6
sessionInfo()
## R version 4.2.3 (2023-03-15)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] grid      stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] readxl_1.4.3                UpSetR_1.4.0               
##  [3] ComplexHeatmap_2.14.0       data.table_1.15.4          
##  [5] dplyr_1.1.4                 ggpointdensity_0.1.0       
##  [7] viridis_0.6.5               viridisLite_0.4.2          
##  [9] DESeq2_1.38.3               SummarizedExperiment_1.28.0
## [11] Biobase_2.58.0              MatrixGenerics_1.10.0      
## [13] matrixStats_1.3.0           GenomicRanges_1.50.2       
## [15] GenomeInfoDb_1.34.9         IRanges_2.32.0             
## [17] S4Vectors_0.36.2            BiocGenerics_0.44.0        
## [19] RColorBrewer_1.1-3          MASS_7.3-60                
## [21] stringr_1.5.1               ggpubr_0.6.0               
## [23] ggrepel_0.9.5               ggplot2_3.5.1              
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-8           bit64_4.0.5            doParallel_1.0.17     
##  [4] httr_1.4.7             tools_4.2.3            backports_1.5.0       
##  [7] bslib_0.8.0            utf8_1.2.4             R6_2.5.1              
## [10] DBI_1.2.3              colorspace_2.1-1       GetoptLong_1.0.5      
## [13] withr_3.0.1            tidyselect_1.2.1       gridExtra_2.3         
## [16] bit_4.0.5              compiler_4.2.3         cli_3.6.3             
## [19] Cairo_1.6-2            DelayedArray_0.24.0    labeling_0.4.3        
## [22] sass_0.4.9             scales_1.3.0           digest_0.6.37         
## [25] rmarkdown_2.28         XVector_0.38.0         pkgconfig_2.0.3       
## [28] htmltools_0.5.8.1      highr_0.11             fastmap_1.2.0         
## [31] GlobalOptions_0.1.2    rlang_1.1.4            rstudioapi_0.16.0     
## [34] RSQLite_2.3.7          farver_2.1.2           shape_1.4.6.1         
## [37] jquerylib_0.1.4        generics_0.1.3         jsonlite_1.8.8        
## [40] BiocParallel_1.32.6    car_3.1-2              RCurl_1.98-1.16       
## [43] magrittr_2.0.3         GenomeInfoDbData_1.2.9 Matrix_1.5-4.1        
## [46] Rcpp_1.0.13            munsell_0.5.1          fansi_1.0.6           
## [49] abind_1.4-5            lifecycle_1.0.4        stringi_1.8.4         
## [52] yaml_2.3.10            carData_3.0-5          zlibbioc_1.44.0       
## [55] plyr_1.8.9             blob_1.2.4             parallel_4.2.3        
## [58] crayon_1.5.3           lattice_0.22-6         Biostrings_2.66.0     
## [61] annotate_1.76.0        circlize_0.4.16        KEGGREST_1.38.0       
## [64] locfit_1.5-9.10        knitr_1.48             pillar_1.9.0          
## [67] rjson_0.2.21           ggsignif_0.6.4         geneplotter_1.76.0    
## [70] codetools_0.2-20       XML_3.99-0.17          glue_1.7.0            
## [73] evaluate_0.24.0        foreach_1.5.2          vctrs_0.6.5           
## [76] png_0.1-8              cellranger_1.1.0       gtable_0.3.5          
## [79] purrr_1.0.2            tidyr_1.3.1            clue_0.3-65           
## [82] cachem_1.1.0           xfun_0.47              xtable_1.8-4          
## [85] broom_1.0.6            rstatix_0.7.2          tibble_3.2.1          
## [88] iterators_1.0.14       AnnotationDbi_1.60.2   memoise_2.0.1         
## [91] cluster_2.1.6